# To help with reading and manipulating data
import pandas as pd
pd.set_option('display.html.use_mathjax', False)
import numpy as np
# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# To be used for missing value imputation
from sklearn.impute import SimpleImputer
# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
BaggingClassifier,
)
from xgboost import XGBClassifier
# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
roc_auc_score,
ConfusionMatrixDisplay,
)
# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To supress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)
# To supress warnings
import warnings
warnings.filterwarnings("ignore")
# This will help in making the Python code more structured automatically (good coding practice)
# %load_ext nb_black
#Imblearn library is used to handle imbalanced data
!pip install imblearn --user
!pip install imbalanced-learn --user
Collecting imblearn Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes) Requirement already satisfied: imbalanced-learn in /usr/local/lib/python3.10/dist-packages (from imblearn) (0.12.3) Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn->imblearn) (1.25.2) Requirement already satisfied: scipy>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn->imblearn) (1.13.1) Requirement already satisfied: scikit-learn>=1.0.2 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn->imblearn) (1.3.2) Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn->imblearn) (1.4.2) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn->imblearn) (3.5.0) Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB) Installing collected packages: imblearn Successfully installed imblearn-0.0 Requirement already satisfied: imbalanced-learn in /usr/local/lib/python3.10/dist-packages (0.12.3) Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn) (1.25.2) Requirement already satisfied: scipy>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn) (1.13.1) Requirement already satisfied: scikit-learn>=1.0.2 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn) (1.3.2) Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn) (1.4.2) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn) (3.5.0)
# Read the raw dataset into memory
bank = pd.read_csv('BankChurners.csv')
# Inspect the number of rows and columns (shown below: 10127 rows x 21 columns)
bank.shape
(10127, 21)
# Work on a copy so the raw `bank` DataFrame stays untouched
data = bank.copy()
# View first 5 records of data
data.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.000 | 777 | 11914.000 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.000 | 864 | 7392.000 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.000 | 0 | 3418.000 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.000 | 2517 | 796.000 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.000 | 0 | 4716.000 | 2.175 | 816 | 28 | 2.500 | 0.000 |
# View last 5 records of data to confirm the file was read to the end
data.tail()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10122 | 772366833 | Existing Customer | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | 3 | 2 | 3 | 4003.000 | 1851 | 2152.000 | 0.703 | 15476 | 117 | 0.857 | 0.462 |
| 10123 | 710638233 | Attrited Customer | 41 | M | 2 | NaN | Divorced | $40K - $60K | Blue | 25 | 4 | 2 | 3 | 4277.000 | 2186 | 2091.000 | 0.804 | 8764 | 69 | 0.683 | 0.511 |
| 10124 | 716506083 | Attrited Customer | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | 5 | 3 | 4 | 5409.000 | 0 | 5409.000 | 0.819 | 10291 | 60 | 0.818 | 0.000 |
| 10125 | 717406983 | Attrited Customer | 30 | M | 2 | Graduate | NaN | $40K - $60K | Blue | 36 | 4 | 3 | 3 | 5281.000 | 0 | 5281.000 | 0.535 | 8395 | 62 | 0.722 | 0.000 |
| 10126 | 714337233 | Attrited Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | 6 | 2 | 4 | 10388.000 | 1961 | 8427.000 | 0.703 | 10294 | 61 | 0.649 | 0.189 |
# Column dtypes and non-null counts (reveals missing values in Education_Level and Marital_Status)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
# Count fully duplicated rows (0 expected, since CLIENTNUM should be unique)
data.duplicated().sum()
0
# Percentage of missing values per column; isnull().count() equals the total
# row count, so this is (missing / total) * 100 rounded to 2 decimals
round(data.isnull().sum() / data.isnull().count() * 100, 2)
CLIENTNUM 0.000 Attrition_Flag 0.000 Customer_Age 0.000 Gender 0.000 Dependent_count 0.000 Education_Level 15.000 Marital_Status 7.400 Income_Category 0.000 Card_Category 0.000 Months_on_book 0.000 Total_Relationship_Count 0.000 Months_Inactive_12_mon 0.000 Contacts_Count_12_mon 0.000 Credit_Limit 0.000 Total_Revolving_Bal 0.000 Avg_Open_To_Buy 0.000 Total_Amt_Chng_Q4_Q1 0.000 Total_Trans_Amt 0.000 Total_Trans_Ct 0.000 Total_Ct_Chng_Q4_Q1 0.000 Avg_Utilization_Ratio 0.000 dtype: float64
# Absolute count of null values per column (complements the percentage view above)
data.isna().sum()
CLIENTNUM 0 Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
# Distinct values per column — CLIENTNUM is unique per row (an ID, not a feature)
data.nunique()
CLIENTNUM 10127 Attrition_Flag 2 Customer_Age 45 Gender 2 Dependent_count 6 Education_Level 6 Marital_Status 3 Income_Category 6 Card_Category 4 Months_on_book 44 Total_Relationship_Count 6 Months_Inactive_12_mon 7 Contacts_Count_12_mon 7 Credit_Limit 6205 Total_Revolving_Bal 1974 Avg_Open_To_Buy 6813 Total_Amt_Chng_Q4_Q1 1158 Total_Trans_Amt 5033 Total_Trans_Ct 126 Total_Ct_Chng_Q4_Q1 830 Avg_Utilization_Ratio 964 dtype: int64
# Statistical summary of the numerical columns (transposed for readability)
data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CLIENTNUM | 10127.000 | 739177606.334 | 36903783.450 | 708082083.000 | 713036770.500 | 717926358.000 | 773143533.000 | 828343083.000 |
| Customer_Age | 10127.000 | 46.326 | 8.017 | 26.000 | 41.000 | 46.000 | 52.000 | 73.000 |
| Dependent_count | 10127.000 | 2.346 | 1.299 | 0.000 | 1.000 | 2.000 | 3.000 | 5.000 |
| Months_on_book | 10127.000 | 35.928 | 7.986 | 13.000 | 31.000 | 36.000 | 40.000 | 56.000 |
| Total_Relationship_Count | 10127.000 | 3.813 | 1.554 | 1.000 | 3.000 | 4.000 | 5.000 | 6.000 |
| Months_Inactive_12_mon | 10127.000 | 2.341 | 1.011 | 0.000 | 2.000 | 2.000 | 3.000 | 6.000 |
| Contacts_Count_12_mon | 10127.000 | 2.455 | 1.106 | 0.000 | 2.000 | 2.000 | 3.000 | 6.000 |
| Credit_Limit | 10127.000 | 8631.954 | 9088.777 | 1438.300 | 2555.000 | 4549.000 | 11067.500 | 34516.000 |
| Total_Revolving_Bal | 10127.000 | 1162.814 | 814.987 | 0.000 | 359.000 | 1276.000 | 1784.000 | 2517.000 |
| Avg_Open_To_Buy | 10127.000 | 7469.140 | 9090.685 | 3.000 | 1324.500 | 3474.000 | 9859.000 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 10127.000 | 0.760 | 0.219 | 0.000 | 0.631 | 0.736 | 0.859 | 3.397 |
| Total_Trans_Amt | 10127.000 | 4404.086 | 3397.129 | 510.000 | 2155.500 | 3899.000 | 4741.000 | 18484.000 |
| Total_Trans_Ct | 10127.000 | 64.859 | 23.473 | 10.000 | 45.000 | 67.000 | 81.000 | 139.000 |
| Total_Ct_Chng_Q4_Q1 | 10127.000 | 0.712 | 0.238 | 0.000 | 0.582 | 0.702 | 0.818 | 3.714 |
| Avg_Utilization_Ratio | 10127.000 | 0.275 | 0.276 | 0.000 | 0.023 | 0.176 | 0.503 | 0.999 |
# User defined function for obtaining the distribution of the categorical variables
def categorical_analysis(data):
    """Print the proportion of each level for every column of *data*."""
    for column in data.columns:
        # normalize=True converts raw counts into fractions of the column total
        print(data[column].value_counts(normalize = True))
        print("-" * 25)
# Collect the names of the object-dtype (categorical) columns.
# select_dtypes replaces the manual positional loop over data.dtypes
# and yields the same columns in the same order.
categorical_col = data.select_dtypes(include="object").columns.tolist()
print(f'The categorical features of the data are: {categorical_col}')
The categorical features of the data are: ['Attrition_Flag', 'Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
categorical_analysis(data[categorical_col])
Attrition_Flag Existing Customer 0.839 Attrited Customer 0.161 Name: proportion, dtype: float64 ------------------------- Gender F 0.529 M 0.471 Name: proportion, dtype: float64 ------------------------- Education_Level Graduate 0.363 High School 0.234 Uneducated 0.173 College 0.118 Post-Graduate 0.060 Doctorate 0.052 Name: proportion, dtype: float64 ------------------------- Marital_Status Married 0.500 Single 0.420 Divorced 0.080 Name: proportion, dtype: float64 ------------------------- Income_Category Less than $40K 0.352 $40K - $60K 0.177 $80K - $120K 0.152 $60K - $80K 0.138 abc 0.110 $120K + 0.072 Name: proportion, dtype: float64 ------------------------- Card_Category Blue 0.932 Silver 0.055 Gold 0.011 Platinum 0.002 Name: proportion, dtype: float64 -------------------------
*Interestingly, 36.3% of customers — a plurality, though not a majority — have a graduate-level Education Level, while 35.2% of customers (also the largest single group) earn less than \$40K — a result that should be investigated, since higher education is usually associated with higher income.
*Lower income in spite of the high proportion of graduate degrees might possibly be explained by the majority-female customer population (52.9% female), if a gender pay gap is present.
*An overwhelming majority of customers hold what is assumed to be the lowest-ranked card, the so-called "Blue" card category, possibly with a lower credit limit. This would be consistent with a largely graduate customer population, as these customers likely carry debt while earning less than $40K. Interestingly, the majority of customers are married, so it is assumed that the reported income is individual rather than joint.
# User defined function for univariate analysis of one quantitative feature in a dataset
def univariate_analysis(data, feature, include_def = False):
    """
    Plot and summarize a single quantitative feature of *data*.

    This function computes major and minor outliers as defined in
    'Introduction to Contemporary Statistical Methods -- second edition'
    by Dr. Lambert H. Koopmans (pg. 53):
      * Minor outliers lie beyond Q3/Q1 +/- 1.5*IQR but within Q3/Q1 +/- 3*IQR.
      * Major/Extreme outliers lie beyond Q3/Q1 +/- 3*IQR.
    It also prints quartiles, fences, mean, standard deviation, skewness,
    kurtosis, coefficient of variation and (for non-negative features) the
    signal-to-noise ratio, and shows a boxplot, histogram and density plot.

    Skewness/kurtosis descriptions from the Engineering Statistics Handbook
    (https://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm):
      * Skewness is a measure of symmetry, or more precisely, the lack of symmetry.
      * Kurtosis is a measure of whether the data are heavy-tailed or light-tailed
        relative to a normal distribution; high kurtosis means heavy tails/outliers,
        low kurtosis means light tails (a uniform distribution is the extreme case).

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset containing the feature.
    feature : str
        Name of the numeric column to analyze.
    include_def : bool, default False
        If True, also print the skewness/kurtosis definitions.
    """
    # Local imports keep the function self-contained when copied between notebooks.
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy.stats import kurtosis, skew

    ## Display plots: boxplot, histogram and density side by side
    fig, axes = plt.subplots(ncols = 3, figsize = (15, 5))
    sns.boxplot(data = data, x = feature, ax = axes[0]).set(title = f'Boxplot of {feature}')
    sns.histplot(data = data, x = feature, ax = axes[1]).set(title = f'Histogram plot of {feature}')
    sns.kdeplot(data = data, x = feature, ax = axes[2]).set(title = f'Density plot of {feature}')
    plt.show()

    # Summary statistics
    s = data[feature]
    Q1 = s.quantile(.25) ; Q2 = s.quantile(.5) ; Q3 = s.quantile(.75)
    Mean = s.mean()
    STD = np.std(s)   # population standard deviation (ddof=0), matching np.std's default
    SNR = Mean / STD  # signal-to-noise ratio; only meaningful for non-negative data
    CV = STD / Mean   # coefficient of variation
    Skew = skew(s, nan_policy = 'omit')
    Kurtosis = kurtosis(s, nan_policy = 'omit')

    # Tukey fences
    IQR = Q3 - Q1
    upper_fence = Q3 + 1.5 * IQR ; lower_fence = Q1 - 1.5 * IQR
    outer_upper_fence = Q3 + 3 * IQR ; outer_lower_fence = Q1 - 3 * IQR

    # Gather each outlier group once with boolean masks. The previous version
    # re-filtered the whole DataFrame on every loop iteration, which is
    # needlessly quadratic in the number of outliers.
    minor_outliers_Q3 = s[(s > upper_fence) & (s <= outer_upper_fence)].tolist()
    major_outliers_Q3 = s[s > outer_upper_fence].tolist()
    minor_outliers_Q1 = s[(s >= outer_lower_fence) & (s < lower_fence)].tolist()
    major_outliers_Q1 = s[s < outer_lower_fence].tolist()

    def _report_outliers(outliers, kind, fence_desc):
        # Print the count and mean of one outlier group, or a "none found" notice.
        # (Also fixes the original grammar slip "The number major outliers".)
        if len(outliers) == 0:
            print(f'*There are no {kind} {fence_desc}!')
        else:
            print(f'*The number of {kind} {fence_desc} are:', len(outliers),
                  'and the average of these outliers are:',
                  round(pd.Series(outliers).mean(), ndigits=2))

    _report_outliers(minor_outliers_Q3, 'minor outliers', 'beyond Quartile-3 + 1.5*IQR')
    _report_outliers(major_outliers_Q3, 'major outliers', 'beyond Quartile-3 + 3*IQR')
    _report_outliers(minor_outliers_Q1, 'minor outliers', 'below Quartile-1 - 1.5*IQR')
    _report_outliers(major_outliers_Q1, 'major outliers', 'below Quartile-1 - 3*IQR')
    print('-'*50)
    print('-'*50)
    print('*Quartile-1 is:', round(Q1, ndigits=2))
    print('*Quartile-2 (median) is:', round(Q2, ndigits=2))
    print('*Quartile-3 is:', round(Q3, ndigits=2))
    print('*Quartile-1 - 1.5*IQR = ', round(lower_fence, ndigits=2))
    print('*Quartile-1 - 3*IQR = ', round(outer_lower_fence, ndigits=2))
    print('*Quartile-3 + 1.5*IQR = ', round(upper_fence, ndigits=2))
    print('*Quartile-3 + 3*IQR = ', round(outer_upper_fence, ndigits=2))
    print('-'*50)
    print('-'*50)
    print(f'*Average of {feature} is:', round(Mean, ndigits=2))
    print(f'*Standard deviation of {feature} is:', round(STD, ndigits=2))
    print(f'*Skewness of {feature} is:', round(Skew, ndigits=2))
    print(f'*Kurtosis of {feature} is:', round(Kurtosis, ndigits=2))
    print(f'*Coefficient of variation of {feature} is:', round(CV, ndigits=2))
    if s.min() >= 0:
        print(f'*Signal to Noise Ratio of {feature} is:', round(SNR, ndigits=2))
        print('**Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative**')
    else:
        print('*Signal to Noise Ratio DOES NOT EXIST')
    print('-'*50)
    print('-'*50)
    if include_def:
        print('**Definitions:')
        print(f'*Skewness is a measure of symmetry, or more precisely, the lack of symmetry. \n A distribution, or data set, is symmetric if it looks the same to the left and right of the center point.')
        print(f'*Kurtosis is a measure of whether the data are heavy-tailed or light-tailed relative to a normal distribution. \n That is, data sets with high kurtosis tend to have heavy tails, or outliers. \n Data sets with low kurtosis tend to have light tails, or lack of outliers. A uniform distribution would be the extreme case.')
# Collect the names of the numeric (int64/float64) columns.
# select_dtypes replaces the manual positional loop over data.dtypes
# and yields the same columns in the same order.
numeric_col = data.select_dtypes(include=["int64", "float64"]).columns.tolist()
print(f'The numeric features of the data are: {numeric_col}')
The numeric features of the data are: ['CLIENTNUM', 'Customer_Age', 'Dependent_count', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']
# Batch univariate analysis for every quantitative variable
for feature_name in numeric_col:
    univariate_analysis(data, feature = feature_name)
*There are no minor outliers beyond Quartile-3 + 1.5*IQR! *There are no major outliers beyond Quartile-3 + 3*IQR! *There are no minor outliers below Quartile-1 - 1.5*IQR! *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 713036770.5 *Quartile-2 (median) is: 717926358.0 *Quartile-3 is: 773143533.0 *Quartile-1 - 1.5*IQR = 622876626.75 *Quartile-1 - 3*IQR = 532716483.0 *Quartile-3 + 1.5*IQR = 863303676.75 *Quartile-3 + 3*IQR = 953463820.5 -------------------------------------------------- -------------------------------------------------- *Average of CLIENTNUM is: 739177606.33 *Standard deviation of CLIENTNUM is: 36901961.36 *Skewness of CLIENTNUM is: 1.0 *Kurtosis of CLIENTNUM is: -0.62 *Coefficient of variation of CLIENTNUM is: 0.05 *Signal to Noise Ratio of CLIENTNUM is: 20.03 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*The number of minor outliers beyond Quartile-3 + 1.5*IQR are: 2 and the average of these outliers are: 71.5 *There are no major outliers beyond Quartile-3 + 3*IQR! *There are no minor outliers below Quartile-1 - 1.5*IQR! *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 41.0 *Quartile-2 (median) is: 46.0 *Quartile-3 is: 52.0 *Quartile-1 - 1.5*IQR = 24.5 *Quartile-1 - 3*IQR = 8.0 *Quartile-3 + 1.5*IQR = 68.5 *Quartile-3 + 3*IQR = 85.0 -------------------------------------------------- -------------------------------------------------- *Average of Customer_Age is: 46.33 *Standard deviation of Customer_Age is: 8.02 *Skewness of Customer_Age is: -0.03 *Kurtosis of Customer_Age is: -0.29 *Coefficient of variation of Customer_Age is: 0.17 *Signal to Noise Ratio of Customer_Age is: 5.78 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*There are no minor outliers beyond Quartile-3 + 1.5*IQR! *There are no major outliers beyond Quartile-3 + 3*IQR! *There are no minor outliers below Quartile-1 - 1.5*IQR! *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 1.0 *Quartile-2 (median) is: 2.0 *Quartile-3 is: 3.0 *Quartile-1 - 1.5*IQR = -2.0 *Quartile-1 - 3*IQR = -5.0 *Quartile-3 + 1.5*IQR = 6.0 *Quartile-3 + 3*IQR = 9.0 -------------------------------------------------- -------------------------------------------------- *Average of Dependent_count is: 2.35 *Standard deviation of Dependent_count is: 1.3 *Skewness of Dependent_count is: -0.02 *Kurtosis of Dependent_count is: -0.68 *Coefficient of variation of Dependent_count is: 0.55 *Signal to Noise Ratio of Dependent_count is: 1.81 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*The number of minor outliers beyond Quartile-3 + 1.5*IQR are: 198 and the average of these outliers are: 55.25 *There are no major outliers beyond Quartile-3 + 3*IQR! *The number of minor outliers below Quartile-1 - 1.5*IQR are: 188 and the average of these outliers are: 14.74 *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 31.0 *Quartile-2 (median) is: 36.0 *Quartile-3 is: 40.0 *Quartile-1 - 1.5*IQR = 17.5 *Quartile-1 - 3*IQR = 4.0 *Quartile-3 + 1.5*IQR = 53.5 *Quartile-3 + 3*IQR = 67.0 -------------------------------------------------- -------------------------------------------------- *Average of Months_on_book is: 35.93 *Standard deviation of Months_on_book is: 7.99 *Skewness of Months_on_book is: -0.11 *Kurtosis of Months_on_book is: 0.4 *Coefficient of variation of Months_on_book is: 0.22 *Signal to Noise Ratio of Months_on_book is: 4.5 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*There are no minor outliers beyond Quartile-3 + 1.5*IQR! *There are no major outliers beyond Quartile-3 + 3*IQR! *There are no minor outliers below Quartile-1 - 1.5*IQR! *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 3.0 *Quartile-2 (median) is: 4.0 *Quartile-3 is: 5.0 *Quartile-1 - 1.5*IQR = 0.0 *Quartile-1 - 3*IQR = -3.0 *Quartile-3 + 1.5*IQR = 8.0 *Quartile-3 + 3*IQR = 11.0 -------------------------------------------------- -------------------------------------------------- *Average of Total_Relationship_Count is: 3.81 *Standard deviation of Total_Relationship_Count is: 1.55 *Skewness of Total_Relationship_Count is: -0.16 *Kurtosis of Total_Relationship_Count is: -1.01 *Coefficient of variation of Total_Relationship_Count is: 0.41 *Signal to Noise Ratio of Total_Relationship_Count is: 2.45 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*The number of minor outliers beyond Quartile-3 + 1.5*IQR are: 302 and the average of these outliers are: 5.41 *There are no major outliers beyond Quartile-3 + 3*IQR! *The number of minor outliers below Quartile-1 - 1.5*IQR are: 29 and the average of these outliers are: 0.0 *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 2.0 *Quartile-2 (median) is: 2.0 *Quartile-3 is: 3.0 *Quartile-1 - 1.5*IQR = 0.5 *Quartile-1 - 3*IQR = -1.0 *Quartile-3 + 1.5*IQR = 4.5 *Quartile-3 + 3*IQR = 6.0 -------------------------------------------------- -------------------------------------------------- *Average of Months_Inactive_12_mon is: 2.34 *Standard deviation of Months_Inactive_12_mon is: 1.01 *Skewness of Months_Inactive_12_mon is: 0.63 *Kurtosis of Months_Inactive_12_mon is: 1.1 *Coefficient of variation of Months_Inactive_12_mon is: 0.43 *Signal to Noise Ratio of Months_Inactive_12_mon is: 2.32 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*The number of minor outliers beyond Quartile-3 + 1.5*IQR are: 230 and the average of these outliers are: 5.23 *There are no major outliers beyond Quartile-3 + 3*IQR! *The number of minor outliers below Quartile-1 - 1.5*IQR are: 399 and the average of these outliers are: 0.0 *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 2.0 *Quartile-2 (median) is: 2.0 *Quartile-3 is: 3.0 *Quartile-1 - 1.5*IQR = 0.5 *Quartile-1 - 3*IQR = -1.0 *Quartile-3 + 1.5*IQR = 4.5 *Quartile-3 + 3*IQR = 6.0 -------------------------------------------------- -------------------------------------------------- *Average of Contacts_Count_12_mon is: 2.46 *Standard deviation of Contacts_Count_12_mon is: 1.11 *Skewness of Contacts_Count_12_mon is: 0.01 *Kurtosis of Contacts_Count_12_mon is: 0.0 *Coefficient of variation of Contacts_Count_12_mon is: 0.45 *Signal to Noise Ratio of Contacts_Count_12_mon is: 2.22 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*The number of minor outliers beyond Quartile-3 + 1.5*IQR are: 984 and the average of these outliers are: 31551.34 *There are no major outliers beyond Quartile-3 + 3*IQR! *There are no minor outliers below Quartile-1 - 1.5*IQR! *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 2555.0 *Quartile-2 (median) is: 4549.0 *Quartile-3 is: 11067.5 *Quartile-1 - 1.5*IQR = -10213.75 *Quartile-1 - 3*IQR = -22982.5 *Quartile-3 + 1.5*IQR = 23836.25 *Quartile-3 + 3*IQR = 36605.0 -------------------------------------------------- -------------------------------------------------- *Average of Credit_Limit is: 8631.95 *Standard deviation of Credit_Limit is: 9088.33 *Skewness of Credit_Limit is: 1.67 *Kurtosis of Credit_Limit is: 1.81 *Coefficient of variation of Credit_Limit is: 1.05 *Signal to Noise Ratio of Credit_Limit is: 0.95 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*There are no minor outliers beyond Quartile-3 + 1.5*IQR! *There are no major outliers beyond Quartile-3 + 3*IQR! *There are no minor outliers below Quartile-1 - 1.5*IQR! *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 359.0 *Quartile-2 (median) is: 1276.0 *Quartile-3 is: 1784.0 *Quartile-1 - 1.5*IQR = -1778.5 *Quartile-1 - 3*IQR = -3916.0 *Quartile-3 + 1.5*IQR = 3921.5 *Quartile-3 + 3*IQR = 6059.0 -------------------------------------------------- -------------------------------------------------- *Average of Total_Revolving_Bal is: 1162.81 *Standard deviation of Total_Revolving_Bal is: 814.95 *Skewness of Total_Revolving_Bal is: -0.15 *Kurtosis of Total_Revolving_Bal is: -1.15 *Coefficient of variation of Total_Revolving_Bal is: 0.7 *Signal to Noise Ratio of Total_Revolving_Bal is: 1.43 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*The number of minor outliers beyond Quartile-3 + 1.5*IQR are: 963 and the average of these outliers are: 30531.66 *There are no major outliers beyond Quartile-3 + 3*IQR! *There are no minor outliers below Quartile-1 - 1.5*IQR! *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 1324.5 *Quartile-2 (median) is: 3474.0 *Quartile-3 is: 9859.0 *Quartile-1 - 1.5*IQR = -11477.25 *Quartile-1 - 3*IQR = -24279.0 *Quartile-3 + 1.5*IQR = 22660.75 *Quartile-3 + 3*IQR = 35462.5 -------------------------------------------------- -------------------------------------------------- *Average of Avg_Open_To_Buy is: 7469.14 *Standard deviation of Avg_Open_To_Buy is: 9090.24 *Skewness of Avg_Open_To_Buy is: 1.66 *Kurtosis of Avg_Open_To_Buy is: 1.8 *Coefficient of variation of Avg_Open_To_Buy is: 1.22 *Signal to Noise Ratio of Avg_Open_To_Buy is: 0.82 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*The number of minor outliers beyond Quartile-3 + 1.5*IQR are: 258 and the average of these outliers are: 1.34 *The number major outliers beyond Quartile-3 + 3*IQR are: 90 and the average of these outliers are: 1.83 *The number of minor outliers below Quartile-1 - 1.5*IQR are: 48 and the average of these outliers are: 0.18 *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 0.63 *Quartile-2 (median) is: 0.74 *Quartile-3 is: 0.86 *Quartile-1 - 1.5*IQR = 0.29 *Quartile-1 - 3*IQR = -0.05 *Quartile-3 + 1.5*IQR = 1.2 *Quartile-3 + 3*IQR = 1.54 -------------------------------------------------- -------------------------------------------------- *Average of Total_Amt_Chng_Q4_Q1 is: 0.76 *Standard deviation of Total_Amt_Chng_Q4_Q1 is: 0.22 *Skewness of Total_Amt_Chng_Q4_Q1 is: 1.73 *Kurtosis of Total_Amt_Chng_Q4_Q1 is: 9.99 *Coefficient of variation of Total_Amt_Chng_Q4_Q1 is: 0.29 *Signal to Noise Ratio of Total_Amt_Chng_Q4_Q1 is: 3.47 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*The number of minor outliers beyond Quartile-3 + 1.5*IQR are: 159 and the average of these outliers are: 9313.23 *The number major outliers beyond Quartile-3 + 3*IQR are: 737 and the average of these outliers are: 14731.74 *There are no minor outliers below Quartile-1 - 1.5*IQR! *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 2155.5 *Quartile-2 (median) is: 3899.0 *Quartile-3 is: 4741.0 *Quartile-1 - 1.5*IQR = -1722.75 *Quartile-1 - 3*IQR = -5601.0 *Quartile-3 + 1.5*IQR = 8619.25 *Quartile-3 + 3*IQR = 12497.5 -------------------------------------------------- -------------------------------------------------- *Average of Total_Trans_Amt is: 4404.09 *Standard deviation of Total_Trans_Amt is: 3396.96 *Skewness of Total_Trans_Amt is: 2.04 *Kurtosis of Total_Trans_Amt is: 3.89 *Coefficient of variation of Total_Trans_Amt is: 0.77 *Signal to Noise Ratio of Total_Trans_Amt is: 1.3 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*The number of minor outliers beyond Quartile-3 + 1.5*IQR is: 2 and the average of these outliers is: 138.5 *There are no major outliers beyond Quartile-3 + 3*IQR! *There are no minor outliers below Quartile-1 - 1.5*IQR! *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 45.0 *Quartile-2 (median) is: 67.0 *Quartile-3 is: 81.0 *Quartile-1 - 1.5*IQR = -9.0 *Quartile-1 - 3*IQR = -63.0 *Quartile-3 + 1.5*IQR = 135.0 *Quartile-3 + 3*IQR = 189.0 -------------------------------------------------- -------------------------------------------------- *Average of Total_Trans_Ct is: 64.86 *Standard deviation of Total_Trans_Ct is: 23.47 *Skewness of Total_Trans_Ct is: 0.15 *Kurtosis of Total_Trans_Ct is: -0.37 *Coefficient of variation of Total_Trans_Ct is: 0.36 *Signal to Noise Ratio of Total_Trans_Ct is: 2.76 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*The number of minor outliers beyond Quartile-3 + 1.5*IQR are: 218 and the average of these outliers are: 1.3 *The number major outliers beyond Quartile-3 + 3*IQR are: 80 and the average of these outliers are: 2.02 *The number of minor outliers below Quartile-1 - 1.5*IQR are: 96 and the average of these outliers are: 0.15 *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 0.58 *Quartile-2 (median) is: 0.7 *Quartile-3 is: 0.82 *Quartile-1 - 1.5*IQR = 0.23 *Quartile-1 - 3*IQR = -0.13 *Quartile-3 + 1.5*IQR = 1.17 *Quartile-3 + 3*IQR = 1.53 -------------------------------------------------- -------------------------------------------------- *Average of Total_Ct_Chng_Q4_Q1 is: 0.71 *Standard deviation of Total_Ct_Chng_Q4_Q1 is: 0.24 *Skewness of Total_Ct_Chng_Q4_Q1 is: 2.06 *Kurtosis of Total_Ct_Chng_Q4_Q1 is: 15.68 *Coefficient of variation of Total_Ct_Chng_Q4_Q1 is: 0.33 *Signal to Noise Ratio of Total_Ct_Chng_Q4_Q1 is: 2.99 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
*There are no minor outliers beyond Quartile-3 + 1.5*IQR! *There are no major outliers beyond Quartile-3 + 3*IQR! *There are no minor outliers below Quartile-1 - 1.5*IQR! *There are no major outliers below Quartile-1 - 3*IQR! -------------------------------------------------- -------------------------------------------------- *Quartile-1 is: 0.02 *Quartile-2 (median) is: 0.18 *Quartile-3 is: 0.5 *Quartile-1 - 1.5*IQR = -0.7 *Quartile-1 - 3*IQR = -1.42 *Quartile-3 + 1.5*IQR = 1.22 *Quartile-3 + 3*IQR = 1.94 -------------------------------------------------- -------------------------------------------------- *Average of Avg_Utilization_Ratio is: 0.27 *Standard deviation of Avg_Utilization_Ratio is: 0.28 *Skewness of Avg_Utilization_Ratio is: 0.72 *Kurtosis of Avg_Utilization_Ratio is: -0.8 *Coefficient of variation of Avg_Utilization_Ratio is: 1.0 *Signal to Noise Ratio of Avg_Utilization_Ratio is: 1.0 **Note: Signal to Noise Ratio statistic is only applicable to variables which are always non-negative** -------------------------------------------------- --------------------------------------------------
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with a percentage (or count) label at the top of each bar.

    data: dataframe
    feature: dataframe column to plot
    perc: whether to display percentages instead of counts (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """
    total = len(data[feature])  # denominator for percentage labels
    count = data[feature].nunique()
    # scale the figure width with the number of categories being shown
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))
    plt.xticks(rotation=45, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:  # truthiness check instead of `perc == True`
            # percentage of each class of the category
            label = "{:.1f}%".format(100 * p.get_height() / total)
        else:
            # bar heights are floats; format as an integer so counts
            # render as "1234" rather than "1234.0"
            label = "{:.0f}".format(p.get_height())
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # place the label just above the bar
    plt.show()  # show the plot
# Batch categorical variable analysis: one labeled barplot per categorical column.
# Pass the full dataframe instead of re-slicing data[categorical_col] on every
# iteration — countplot only reads the `feature` column, so the output is the
# same and we avoid building a throwaway sub-DataFrame per loop pass.
for col in categorical_col:
    labeled_barplot(data, feature=col)
# Set the default figure size for subsequent seaborn plots
sns.set(rc={"figure.figsize": (10, 7)})
# Pairwise scatter/histogram matrix of all numeric columns, colored by Attrition_Flag
sns.pairplot(data, hue = "Attrition_Flag", diag_kind = 'hist')
<seaborn.axisgrid.PairGrid at 0x7b81cdb58430>